from bs4 import BeautifulSoup
from books.book_spider import BookSpider
from utils import replace_filename_invalid_chars


class ShuqugeComSpider(BookSpider):
    """Novel spider for the Shuquge site."""

    def _get_links(self, html):
        """Extract chapter links from a book's table-of-contents page.

        Args:
            html: Markup of the contents page. It must contain a
                ``<div class="listmain">`` whose ``<dd>`` children wrap
                the chapter anchors.

        Returns:
            A dict mapping ``"<article id>_<sanitized title>"`` to the
            chapter URL, in page order.

        Raises:
            AttributeError: If the page has no ``div.listmain`` element.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # The first 12 <dd> entries are skipped -- presumably a "latest
        # chapters" teaser that duplicates the full list; verify against
        # the live page layout.
        dds = soup.find('div', class_='listmain').find_all('dd')[12:]
        # NOTE(review): self._url looks like the book's index page URL
        # (set outside this class); hrefs are appended to its directory.
        base_url = self._url.replace('index.html', '')
        links = {}
        for dd in dds:
            # Skip entries that carry no usable link instead of crashing
            # the whole listing on one malformed <dd>.
            a = dd.find('a', href=True)
            if a is None:
                continue
            title = a.text
            if '章 ' not in title:
                # Separate the chapter marker from the chapter name. Only
                # the first '章' is touched so that a name which itself
                # contains '章' is not split apart.
                title = title.replace('章', '章 ', 1)
            # Helper imported from utils; presumably makes the title safe
            # to use as a file name.
            title = replace_filename_invalid_chars(title)
            href = a['href']
            # _get_article_id is defined outside this class; its result
            # prefixes the title, which keeps the dict keys distinct.
            tid = self._get_article_id(href)
            links[f'{tid}_{title}'] = base_url + href
        return links

    def _remove_content_invalid_chars(self, content):
        """Strip space padding and site advertising from chapter text.

        Args:
            content: Chapter text as a string.

        Returns:
            The text with non-breaking spaces and spaces removed, and with
            URL / site-promotion fragments deleted.
        """
        import re
        # Turn non-breaking spaces into plain ones first so the removals
        # below catch them as well.
        content = content.replace('\xa0', ' ')
        content = content.replace('    ', '')
        content = content.replace('  ', '')
        content = content.replace(' ', '')
        # '.' does not match a newline, so each pattern is confined to one
        # line; the match is greedy and runs to the last 'html' / 'com'
        # on that line.
        content = re.sub('https.*html', '', content)
        content = re.sub('请记住.*com', '', content)
        content = content.replace('最新网址:www.ishuquge.com', '')
        return content
    